Libraries

Data

# Read the survey data; the first CSV column supplies the row names.
Data <- read.csv("Final/BoT_Cluster_ENGR_Updated.csv", row.names = 1)
# Drop the first two columns (identifier columns) to keep only the BoT items.
BoT <- Data[, -(1:2)]

Clustering

Hierarchical Clustering (HC)

# Pairwise Euclidean distances between respondents on the BoT items.
dcan <- dist(BoT, method = "euclidean")
# Agglomerative clustering with complete linkage on that distance matrix.
hcan <- hclust(dcan, method = "complete")
# Bare dendrogram (no axes/labels/title) to eyeball the number of clusters.
plot(hcan,
     labels = FALSE,
     hang = 0.01,
     axes = FALSE,
     ann = FALSE,
     main = NA)

Table

We can see from the figure that the best choice for the total number of clusters is 3. We compare the resulting cluster assignments against the original cohort id.

# Cut the dendrogram into 3 clusters and cross-tabulate against cohort id.
HC_cut <- cutree(hcan, k = 3)
table(Data$cohort_id, HC_cut)
##     HC_cut
##       1  2  3
##   1  28 23  5
##   6  25 25  8
##   10 36 37 13
##   11  2 10  2
##   12 17 12  3
##   15 27 21  7
##   16 15 15  1
##   17 21 21  4
##   22 60 78 19
##   28 16 26  8
##   30 17 26  3
##   35 23 27  5
##   39 21 30  9
##   40 19 20  7
##   41 26 16  1

Result

We will save the result so that we can perform statistical tests on it.

# Attach the HC cluster assignment to the BoT data.
Data_HC_Result <- BoT %>% mutate(Cluster = HC_cut)
# Per-cluster means for every BoT variable. Reuses Data_HC_Result instead of
# rebuilding the mutate pipeline, and uses across() in place of the
# superseded summarise_all(). Outer parentheses print the result.
(Data_HC_Result_Summarized <- Data_HC_Result %>%
    group_by(Cluster) %>%
    summarise(across(everything(), mean)))
## # A tibble: 3 × 9
##   Cluster Control SpeakUp Extraversion BT_PastGroups BT_PastPositive
##     <int>   <dbl>   <dbl>        <dbl>         <dbl>           <dbl>
## 1       1    3.64    5.22         4.99          2.69            4.00
## 2       2    3.44    3.68         4.36          2.60            3.81
## 3       3    4.83    3.41         2.72          2.61            3.61
## # … with 3 more variables: GroupPreference <dbl>, Procrastination <dbl>,
## #   BT_Belongingness <dbl>
# Persist the per-respondent assignments and the per-cluster means for the
# downstream tests (row names would only add a meaningless index column).
write.csv(Data_HC_Result, "Final/Data_HC_Result.csv", row.names=FALSE)
write.csv(Data_HC_Result_Summarized, "Final/Data_HC_Result_Summarized.csv", row.names=FALSE)

Plot

Look at the histogram for each cluster

# One interactive count plot per BoT variable, grouped/colored by HC cluster.
plt <- htmltools::tagList()
# Derive the feature columns instead of hard-coding 1:8; Cluster is excluded.
hc_features <- setdiff(names(Data_HC_Result), "Cluster")
for (i in seq_along(hc_features)) {
  p <- ggplot(Data_HC_Result,
              aes(x = as.factor(.data[[hc_features[i]]]),
                  fill = as.factor(Cluster),
                  color = as.factor(Cluster))) +
    # geom_bar is the idiomatic form of geom_histogram(stat = "count")
    # for discrete data and avoids the ggplot2 warning.
    geom_bar(position = "dodge")
  # ggplotly() already returns an htmlwidget; plotly::as.widget() is
  # deprecated, so the extra wrapper is dropped.
  plt[[i]] <- ggplotly(p)
}

Test

Using pairwise Wilcoxon rank-sum tests (the non-parametric alternative to pairwise t-tests after a one-way ANOVA), since I believe my data is not normally distributed. Also, the original data is of ordinal type.

# Example of a single-variable call:
# pairwise.wilcox.test(Data_HC_Result[["Control"]], Data_HC_Result$Cluster, p.adjust.method = "BH")
# Pairwise Wilcoxon rank-sum tests between clusters for every BoT variable,
# with Benjamini-Hochberg adjustment for multiple comparisons. Iterates over
# the named feature columns rather than a hard-coded 1:8.
for (var in setdiff(names(Data_HC_Result), "Cluster")) {
  print(pairwise.wilcox.test(Data_HC_Result[[var]],
                             Data_HC_Result$Cluster,
                             p.adjust.method = "BH"))
}
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1       2      
## 2 0.092   -      
## 3 2.5e-12 < 2e-16
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1      2   
## 2 <2e-16 -   
## 3 <2e-16 0.04
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1      2     
## 2 5e-11  -     
## 3 <2e-16 <2e-16
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1    2   
## 2 0.12 -   
## 3 0.26 0.99
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1       2      
## 2 0.00085 -      
## 3 1.6e-05 0.01404
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1    2   
## 2 0.64 -   
## 3 0.19 0.19
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1      2    
## 2 <2e-16 -    
## 3 0.3    1e-14
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster 
## 
##   1       2
## 2 5.0e-14 -
## 3 1.3e-08 1
## 
## P value adjustment method: BH

Kmeans

Optimal No. of Clusters

# Elbow (within-cluster sum of squares), silhouette, and gap-statistic
# diagnostics for choosing k. "silhouette" is spelled out in full rather
# than relying on partial matching of the method argument ("sil").
ggplotly(fviz_nbclust(BoT, kmeans, method = "wss"))
ggplotly(fviz_nbclust(BoT, kmeans, method = "silhouette"))
fviz_nbclust(BoT, kmeans, method = "gap_stat")

Kmeans clustering

We can see from the various figures that the best choice for the total number of clusters is either 2 (WSS and silhouette) or 3 (gap statistic and the dendrogram from HC). For this paper, we will choose 3 clusters for consistency with HC.

# kmeans with nstart = 25 picks random starting centroids, so fix the RNG
# seed to make the cluster assignments (and the CSVs saved below)
# reproducible across runs.
set.seed(123)
k3 <- kmeans(BoT, centers = 3, nstart = 25)
# Project the clusters onto the first two principal components.
fviz_cluster(k3, data = BoT, geom = "point")

Table

table(Data$cohort_id, k3$cluster)
##     
##       1  2  3
##   1  23 16 17
##   6  13 23 22
##   10 16 30 40
##   11  4  8  2
##   12 11  6 15
##   15 18 16 21
##   16  7 10 14
##   17 10 16 20
##   22 39 69 49
##   28  9 29 12
##   30 15 15 16
##   35 18 19 18
##   39 19 22 19
##   40 11 21 14
##   41 20 10 13

Result

We will save the result so that we can perform statistical tests on it.

# Attach the k-means cluster assignment to the BoT data.
Data_Kmeans_Result <- BoT %>% mutate(Cluster = k3$cluster)

# Per-cluster means for every BoT variable. Reuses Data_Kmeans_Result
# instead of rebuilding the mutate pipeline, and uses across() in place of
# the superseded summarise_all(). Outer parentheses print the result.
(Data_Kmeans_Result_Summarized <- Data_Kmeans_Result %>%
    group_by(Cluster) %>%
    summarise(across(everything(), mean)))
## # A tibble: 3 × 9
##   Cluster Control SpeakUp Extraversion BT_PastGroups BT_PastPositive
##     <int>   <dbl>   <dbl>        <dbl>         <dbl>           <dbl>
## 1       1    5.04    4.97         5.39          2.74            3.81
## 2       2    3.82    3.24         3.10          2.43            3.69
## 3       3    2.46    4.89         5.10          2.78            4.11
## # … with 3 more variables: GroupPreference <dbl>, Procrastination <dbl>,
## #   BT_Belongingness <dbl>
# Persist the per-respondent assignments and the per-cluster means for the
# downstream tests (row names would only add a meaningless index column).
write.csv(Data_Kmeans_Result, "Final/Data_Kmeans_Result.csv", row.names=FALSE)
write.csv(Data_Kmeans_Result_Summarized, "Final/Data_Kmeans_Result_Summarized.csv", row.names=FALSE)

Plot

Look at the histogram for each cluster

# One interactive count plot per BoT variable, grouped/colored by k-means
# cluster.
plt2 <- htmltools::tagList()
# Derive the feature columns instead of hard-coding 1:8; Cluster is excluded.
km_features <- setdiff(names(Data_Kmeans_Result), "Cluster")
for (i in seq_along(km_features)) {
  p <- ggplot(Data_Kmeans_Result,
              aes(x = as.factor(.data[[km_features[i]]]),
                  fill = as.factor(Cluster),
                  color = as.factor(Cluster))) +
    # geom_bar is the idiomatic form of geom_histogram(stat = "count")
    # for discrete data and avoids the ggplot2 warning.
    geom_bar(position = "dodge")
  # ggplotly() already returns an htmlwidget; plotly::as.widget() is
  # deprecated, so the extra wrapper is dropped.
  plt2[[i]] <- ggplotly(p)
}

Test

Using pairwise Wilcoxon rank-sum tests (the non-parametric alternative to pairwise t-tests after a one-way ANOVA), since I believe my data is not normally distributed. Also, the original data is of ordinal type.

# Example of a single-variable call (comment previously referenced the HC
# result by copy-paste; this section tests the k-means result):
# pairwise.wilcox.test(Data_Kmeans_Result[["Control"]], Data_Kmeans_Result$Cluster, p.adjust.method = "BH")
# Pairwise Wilcoxon rank-sum tests between clusters for every BoT variable,
# with Benjamini-Hochberg adjustment for multiple comparisons. Iterates over
# the named feature columns rather than a hard-coded 1:8.
for (var in setdiff(names(Data_Kmeans_Result), "Cluster")) {
  print(pairwise.wilcox.test(Data_Kmeans_Result[[var]],
                             Data_Kmeans_Result$Cluster,
                             p.adjust.method = "BH"))
}
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1      2     
## 2 <2e-16 -     
## 3 <2e-16 <2e-16
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1      2     
## 2 <2e-16 -     
## 3 0.43   <2e-16
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1      2     
## 2 <2e-16 -     
## 3 0.0057 <2e-16
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1       2      
## 2 3.3e-09 -      
## 3 0.55    3.7e-12
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1       2      
## 2 0.12    -      
## 3 9.2e-07 9.8e-12
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1       2      
## 2 0.46    -      
## 3 6.1e-13 2.1e-15
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1       2      
## 2 0.00073 -      
## 3 1.4e-05 0.13691
## 
## P value adjustment method: BH 
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster 
## 
##   1      2     
## 2 <2e-16 -     
## 3 0.23   <2e-16
## 
## P value adjustment method: BH